package crawl;

import java.io.BufferedReader;
import java.io.FileReader;
import java.net.URL;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

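/**
 * Small command-line crawler: reads page URLs from a text file, one per line,
 * downloads each page with HtmlCleaner, and prints to standard output the text
 * of every {@code h1} element whose class is {@code title} and every
 * {@code div} element whose class is {@code rich-content}.
 *
 * @author Tobacco
 */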

public class Crawl {
	/** Text file listing the pages to crawl, one URL per line. */
	private static final String URL_LIST_FILE = "C:\\Users\\Administrator\\Desktop\\123.txt";

	/**
	 * Reads the URL list file line by line and prints the title and content
	 * text of every page listed in it.
	 *
	 * @param args unused
	 * @throws Exception if the list file cannot be read, or a page cannot be
	 *                   fetched, parsed, or queried
	 */
	public static void main(String[] args) throws Exception {
		// try-with-resources closes the reader even if a page fails part-way
		// through the list; previously the reader was never closed.
		try (BufferedReader br = new BufferedReader(new FileReader(URL_LIST_FILE))) {
			String data;
			// readLine() returns null once the end of the file is reached
			while ((data = br.readLine()) != null) {
				String address = data.trim();
				if (address.isEmpty()) {
					// A blank line (e.g. a trailing newline) is not a URL;
					// passing it to new URL(...) would abort the whole run.
					continue;
				}
				ohYes(address);
			}
		}
	}

	/**
	 * Downloads one page and prints the text of its title and content nodes.
	 *
	 * @param data the address of the page to fetch
	 * @throws Exception if the address is malformed, the page cannot be
	 *                   downloaded, or an XPath expression cannot be evaluated
	 */
	private static void ohYes(String data) throws Exception {
		URL url = new URL(data);
		HtmlCleaner cleaner = new HtmlCleaner();
		TagNode node = cleaner.clean(url, "utf-8");

		// Titles first, then bodies, matching the original output order.
		printText(node.evaluateXPath("//h1[@class='title']"));
		printText(node.evaluateXPath("//div[@class='rich-content']"));
	}

	/**
	 * Prints the text content of each matched node on its own line.
	 *
	 * @param nodes XPath results; each element is cast to {@link TagNode}
	 */
	private static void printText(Object[] nodes) {
		for (Object match : nodes) {
			System.out.println(((TagNode) match).getText());
		}
	}
}

// public static void main(String[] args) throws Exception
//
// {
//
// try {
// http://www.linxiaosheng.com/post/[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]/[0-9]*
// HtmlCleaner cleaner = new HtmlCleaner();

// URL url = new URL("http://www.linxiaosheng.com/archive");

// TagNode node = cleaner.clean(url, "gbk");
// <a class="post-meta" target="_blank"
// href="http://www.linxiaosheng.com/post/2013-06-07/40050336442">
// Object[] tagNodes = node.evaluateXPath("//a[@class='post-meta']");

// for (Object tagNode : tagNodes) {

// System.out.println(((TagNode) tagNode).getText());

// System.out.println(((TagNode) tagNode).getAttributeByName("href"));

// }

// }

// catch (Exception exception) {

// exception.printStackTrace();

// }

// }

// @SuppressWarnings("resource")
// public static void main(String[] args) throws Exception {
// BufferedReader br = new BufferedReader(new
// FileReader("C:\\Users\\Administrator\\Desktop\\123.txt"));
// String data = br.readLine();// read one line at a time; null marks the end of the file
// while (data != null) {
// yes(data);
// data = br.readLine(); // then read the next line
// }
// }
//
// private static void yes(String s) throws Exception {
// Pattern pattern =
// Pattern.compile("http://www.linxiaosheng.com/post/[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]/.*");
// Matcher matcher = pattern.matcher(s);
// if (matcher.find()) {
// System.out.println(matcher.group().toString().substring(0,
// matcher.group().toString().length() - 2));
// }
// }
// }
